demographics
nrow(demo1_dis); nrow(demo1_val)
## [1] 29
## [1] 33
table(demo1_dis$calculated_Response); table(demo1_val$calculated_Response)
##
## DR ER
## 14 15
##
## DR ER
## 24 9
table(demo1_val$SEX, demo1_val$calculated_Response)
##
## DR ER
## F 11 5
## M 13 4
## [1] 0.4583333
## [1] 0.5555556
table(as.character(demo1_val$Allergen_cleanLabel), demo1_val$calculated_Response)
##
## DR ER
## Cat 9 7
## Grass 4 2
## HDM 8 0
## Horse 1 0
## Ragweed 2 0
table(demo1_val$SITE, demo1_val$calculated_Response)
##
## DR ER
## LAVAL 9 6
## MAC 12 1
## UBC 3 2
## Weight
fit <- lm(demo1_val$Wt..Kg. ~ demo1_val$calculated_Response)
plot(fit)




shapiro.test(fit$residuals) ## normal
##
## Shapiro-Wilk normality test
##
## data: fit$residuals
## W = 0.9741, p-value = 0.6561
qqnorm(fit$residuals); qqline(fit$residuals, col = 2)

##
## Call:
## lm(formula = demo1_val$Wt..Kg. ~ demo1_val$calculated_Response)
##
## Residuals:
## Min 1Q Median 3Q Max
## -21.709 -8.745 -1.209 9.416 20.405
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 72.209 2.480 29.113 <2e-16 ***
## demo1_val$calculated_ResponseER -3.214 4.803 -0.669 0.509
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 11.63 on 28 degrees of freedom
## (3 observations deleted due to missingness)
## Multiple R-squared: 0.01574, Adjusted R-squared: -0.01941
## F-statistic: 0.4478 on 1 and 28 DF, p-value: 0.5089
## height
fit <- lm(demo1_val$HT.cm. ~ demo1_val$calculated_Response)
plot(fit)




shapiro.test(fit$residuals) ## normal
##
## Shapiro-Wilk normality test
##
## data: fit$residuals
## W = 0.96828, p-value = 0.4534
qqnorm(fit$residuals); qqline(fit$residuals, col = 2)

##
## Call:
## lm(formula = demo1_val$HT.cm. ~ demo1_val$calculated_Response)
##
## Residuals:
## Min 1Q Median 3Q Max
## -18.154 -6.393 -2.132 8.129 16.846
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 171.154 1.883 90.872 <2e-16 ***
## demo1_val$calculated_ResponseER -3.044 3.767 -0.808 0.425
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 9.227 on 30 degrees of freedom
## (1 observation deleted due to missingness)
## Multiple R-squared: 0.02131, Adjusted R-squared: -0.01132
## F-statistic: 0.6531 on 1 and 30 DF, p-value: 0.4254
## Age
fit <- lm(demo1_val$AGE ~ demo1_val$calculated_Response)
plot(fit)




shapiro.test(fit$residuals) ## not normal
##
## Shapiro-Wilk normality test
##
## data: fit$residuals
## W = 0.87783, p-value = 0.001487
qqnorm(fit$residuals); qqline(fit$residuals, col = 2)

wilcox.test(x = demo1_val$AGE[demo1_val$calculated_Response == "ER"],
y = demo1_val$AGE[demo1_val$calculated_Response == "DR"]) # pw=0.35
##
## Wilcoxon rank sum test with continuity correction
##
## data: demo1_val$AGE[demo1_val$calculated_Response == "ER"] and demo1_val$AGE[demo1_val$calculated_Response == "DR"]
## W = 131.5, p-value = 0.3518
## alternative hypothesis: true location shift is not equal to 0
## BLFEV
fit <- lm(demo1_val$BLFEV ~ demo1_val$calculated_Response)
plot(fit)




shapiro.test(fit$residuals) ## normal
##
## Shapiro-Wilk normality test
##
## data: fit$residuals
## W = 0.97164, p-value = 0.5268
qqnorm(fit$residuals); qqline(fit$residuals, col = 2)

##
## Call:
## lm(formula = demo1_val$BLFEV ~ demo1_val$calculated_Response)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.4350 -0.4322 -0.1750 0.3678 1.6150
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.4950 0.1518 23.023 <2e-16 ***
## demo1_val$calculated_ResponseER -0.4428 0.2907 -1.523 0.138
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.7437 on 31 degrees of freedom
## Multiple R-squared: 0.06964, Adjusted R-squared: 0.03962
## F-statistic: 2.32 on 1 and 31 DF, p-value: 0.1378
## PRFEV
fit <- lm(demo1_val$PRFEV ~ demo1_val$calculated_Response)
plot(fit)




shapiro.test(fit$residuals) ## normal
##
## Shapiro-Wilk normality test
##
## data: fit$residuals
## W = 0.9316, p-value = 0.06768
qqnorm(fit$residuals); qqline(fit$residuals, col = 2)

##
## Call:
## lm(formula = demo1_val$PRFEV ~ demo1_val$calculated_Response)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.298 -0.478 -0.188 0.682 1.191
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.67800 0.15441 23.820 <2e-16 ***
## demo1_val$calculated_ResponseER -0.00925 0.28887 -0.032 0.975
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.6905 on 26 degrees of freedom
## (5 observations deleted due to missingness)
## Multiple R-squared: 3.944e-05, Adjusted R-squared: -0.03842
## F-statistic: 0.001025 on 1 and 26 DF, p-value: 0.9747
## EAR
fit <- lm(demo1_val$EAR ~ demo1_val$calculated_Response)
plot(fit)




shapiro.test(fit$residuals) ## normal
##
## Shapiro-Wilk normality test
##
## data: fit$residuals
## W = 0.96757, p-value = 0.4162
qqnorm(fit$residuals); qqline(fit$residuals, col = 2)

##
## Call:
## lm(formula = demo1_val$EAR ~ demo1_val$calculated_Response)
##
## Residuals:
## Min 1Q Median 3Q Max
## -19.312 -4.812 1.387 6.688 15.188
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -36.388 1.829 -19.891 <2e-16 ***
## demo1_val$calculated_ResponseER 4.465 3.503 1.275 0.212
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8.962 on 31 degrees of freedom
## Multiple R-squared: 0.04981, Adjusted R-squared: 0.01916
## F-statistic: 1.625 on 1 and 31 DF, p-value: 0.2119
## LAR
fit <- lm(demo1_val$LAR ~ demo1_val$calculated_Response)
plot(fit)




shapiro.test(fit$residuals) ## not normal
##
## Shapiro-Wilk normality test
##
## data: fit$residuals
## W = 0.92725, p-value = 0.02924
qqnorm(fit$residuals); qqline(fit$residuals, col = 2)

wilcox.test(x = demo1_val$LAR[demo1_val$calculated_Response == "ER"],
y = demo1_val$LAR[demo1_val$calculated_Response == "DR"]) # p=1.388e-05
##
## Wilcoxon rank sum test with continuity correction
##
## data: demo1_val$LAR[demo1_val$calculated_Response == "ER"] and demo1_val$LAR[demo1_val$calculated_Response == "DR"]
## W = 216, p-value = 1.388e-05
## alternative hypothesis: true location shift is not equal to 0
## PC20
fit <- lm(demo1_val$PC20 ~ demo1_val$calculated_Response)
plot(fit)




shapiro.test(fit$residuals) ## not normal
##
## Shapiro-Wilk normality test
##
## data: fit$residuals
## W = 0.89127, p-value = 0.003187
qqnorm(fit$residuals); qqline(fit$residuals, col = 2)

wilcox.test(x = demo1_val$PC20[demo1_val$calculated_Response == "ER"],
y = demo1_val$PC20[demo1_val$calculated_Response == "DR"]) # pw=0.49
##
## Wilcoxon rank sum test
##
## data: demo1_val$PC20[demo1_val$calculated_Response == "ER"] and demo1_val$PC20[demo1_val$calculated_Response == "DR"]
## W = 126, p-value = 0.4862
## alternative hypothesis: true location shift is not equal to 0
## AIS
fit <- lm(demo1_val$AIS ~ demo1_val$calculated_Response)
plot(fit)




shapiro.test(fit$residuals) ## normal
##
## Shapiro-Wilk normality test
##
## data: fit$residuals
## W = 0.92873, p-value = 0.05099
qqnorm(fit$residuals); qqline(fit$residuals, col = 2)

##
## Call:
## lm(formula = demo1_val$AIS ~ demo1_val$calculated_Response)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.4558 -0.9803 -0.5469 1.0149 4.9393
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.9178 0.3342 8.730 2.4e-09 ***
## demo1_val$calculated_ResponseER -1.5656 0.7348 -2.131 0.0424 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.603 on 27 degrees of freedom
## (4 observations deleted due to missingness)
## Multiple R-squared: 0.1439, Adjusted R-squared: 0.1122
## F-statistic: 4.54 on 1 and 27 DF, p-value: 0.04238
# variables that are NOT normally distributed
demo1_val %>% dplyr::select(AGE, LAR, PC20, calculated_Response) %>%
gather(DEMO, Value, -calculated_Response) %>%
group_by(DEMO, calculated_Response) %>%
dplyr::summarise(Median = round(median(Value, na.rm = TRUE), 2),
Q1 = quantile(Value, 0.25, na.rm = TRUE), Q3 = quantile(Value, 0.75, na.rm = TRUE))
## # A tibble: 6 x 5
## # Groups: DEMO [?]
## DEMO calculated_Response Median Q1 Q3
## <chr> <chr> <dbl> <dbl> <dbl>
## 1 AGE DR 25.5 21.8 41
## 2 AGE ER 31 27 36
## 3 LAR DR -23.8 -30.9 -17.5
## 4 LAR ER -7.2 -11.1 -4.9
## 5 PC20 DR 1.58 0.69 4.08
## 6 PC20 ER 5.13 0.64 5.45
# variables that are normally distributed
demo1_val %>% dplyr::select(Wt..Kg., HT.cm., BLFEV, PRFEV, EAR, AIS, calculated_Response) %>%
gather(DEMO, Value, -calculated_Response) %>%
group_by(DEMO, calculated_Response) %>%
summarise(Mean = round(mean(Value, na.rm = TRUE), 2), SD = round(sd(Value, na.rm = TRUE), 2))
## # A tibble: 12 x 4
## # Groups: DEMO [?]
## DEMO calculated_Response Mean SD
## <chr> <chr> <dbl> <dbl>
## 1 AIS DR 2.92 1.69
## 2 AIS ER 1.35 1.13
## 3 BLFEV DR 3.5 0.81
## 4 BLFEV ER 3.05 0.51
## 5 EAR DR -36.4 8.83
## 6 EAR ER -31.9 9.32
## 7 HT.cm. DR 171. 8.91
## 8 HT.cm. ER 168. 10.2
## 9 PRFEV DR 3.68 0.62
## 10 PRFEV ER 3.67 0.84
## 11 Wt..Kg. DR 72.2 11.6
## 12 Wt..Kg. ER 69 11.9
## post PC20
demo <- readRDS(meta_file_rnapc_dir)
demoPost <- subset(demo, Time == "Post")
demoPostval <- demoPost[paste(as.character(demoPost$NAME), as.character(demoPost$AIC_YMD), sep = "_") %in% paste(as.character(demo1_val$NAME), as.character(demo1_val$AIC_YMD), sep = "_"), ]
table(demoPostval$calculated_Response)
##
## DR ER
## 24 9
fit <- lm(demoPostval$PC20 ~ demoPostval$calculated_Response)
plot(fit)




shapiro.test(fit$residuals) ## NOT normal
##
## Shapiro-Wilk normality test
##
## data: fit$residuals
## W = 0.73553, p-value = 7.1e-06
qqnorm(fit$residuals); qqline(fit$residuals, col = 2)

wilcox.test(x = demoPostval$PC20[demoPostval$calculated_Response == "ER"],
y = demoPostval$PC20[demoPostval$calculated_Response == "DR"]) # p=0.04
##
## Wilcoxon rank sum test
##
## data: demoPostval$PC20[demoPostval$calculated_Response == "ER"] and demoPostval$PC20[demoPostval$calculated_Response == "DR"]
## W = 108, p-value = 0.03567
## alternative hypothesis: true location shift is not equal to 0
demoPostval %>% dplyr::select(PC20, calculated_Response) %>%
gather(DEMO, Value, -calculated_Response) %>%
group_by(DEMO, calculated_Response) %>%
dplyr::summarise(Median = round(median(Value, na.rm = TRUE), 2),
Q1 = quantile(Value, 0.25, na.rm = TRUE), Q3 = quantile(Value, 0.75, na.rm = TRUE))
## # A tibble: 2 x 5
## # Groups: DEMO [?]
## DEMO calculated_Response Median Q1 Q3
## <chr> <chr> <dbl> <dbl> <dbl>
## 1 PC20 DR 0.76 0.34 1.55
## 2 PC20 ER 5.11 2.17 7.98
create cbc dataset and phenotype vectors
y.train_cbc <- factor(demo1_dis$calculated_Response, c("ER", "DR"))
names(y.train_cbc ) <- rownames(demo1_dis)
y.test_cbc <- factor(demo1_val$calculated_Response, c("ER", "DR"))
names(y.test_cbc ) <- rownames(demo1_val)
table(y.train_cbc ); table(y.test_cbc )
## y.train_cbc
## ER DR
## 15 14
## y.test_cbc
## ER DR
## 9 24
cbc_train <- na.omit(demo1_dis[, c("PC20", "Leukocyte_Counts.x10.9.", "Neu_percent","lym_percent","mono_percent","eos_percent","baso_percent")])
y.train_cbc <- y.train_cbc[rownames(cbc_train)]
cbc_test <- na.omit(demo1_val[, c("PC20", "Leukocyte_Counts.x10.9.", "Neu_percent","lym_percent","mono_percent","eos_percent","baso_percent")])
y.test_cbc <- y.test_cbc[rownames(cbc_test)]
table(y.test_cbc)
## y.test_cbc
## ER DR
## 9 20
## statistical tests for cell-types
## which cell frequencies signficantly changes between er and dr at pre-challenge
## Leukocytes
fit <- lm(cbc_test$Leukocyte_Counts.x10.9. ~ y.test_cbc)
plot(fit)




shapiro.test(fit$residuals) # normal
##
## Shapiro-Wilk normality test
##
## data: fit$residuals
## W = 0.97067, p-value = 0.578
qqnorm(fit$residuals); qqline(fit$residuals, col = 2)

##
## Call:
## lm(formula = cbc_test$Leukocyte_Counts.x10.9. ~ y.test_cbc)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.9230 -0.7433 -0.2230 0.6770 3.0770
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.1333 0.4817 10.657 3.56e-11 ***
## y.test_cbcDR 0.9897 0.5800 1.706 0.0994 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.445 on 27 degrees of freedom
## Multiple R-squared: 0.09734, Adjusted R-squared: 0.06391
## F-statistic: 2.912 on 1 and 27 DF, p-value: 0.09943
## Neutrophils
fit <- lm(cbc_test$Neu_percent ~ y.test_cbc)
plot(fit)




shapiro.test(fit$residuals) # normal
##
## Shapiro-Wilk normality test
##
## data: fit$residuals
## W = 0.97585, p-value = 0.725
qqnorm(fit$residuals); qqline(fit$residuals, col = 2)

##
## Call:
## lm(formula = cbc_test$Neu_percent ~ y.test_cbc)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.15300 -0.06359 0.00000 0.07041 0.17900
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.58200 0.02841 20.487 <2e-16 ***
## y.test_cbcDR -0.07241 0.03421 -2.117 0.0436 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.08522 on 27 degrees of freedom
## Multiple R-squared: 0.1423, Adjusted R-squared: 0.1106
## F-statistic: 4.481 on 1 and 27 DF, p-value: 0.04363
## Lymphocytes
fit <- lm(cbc_test$lym_percent ~ y.test_cbc)
plot(fit)




shapiro.test(fit$residuals) # normal
##
## Shapiro-Wilk normality test
##
## data: fit$residuals
## W = 0.94436, p-value = 0.1304
qqnorm(fit$residuals); qqline(fit$residuals, col = 2)

##
## Call:
## lm(formula = cbc_test$lym_percent ~ y.test_cbc)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.12822 -0.06171 0.01229 0.05629 0.11278
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.29222 0.02442 11.967 2.64e-12 ***
## y.test_cbcDR 0.07549 0.02940 2.567 0.0161 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.07326 on 27 degrees of freedom
## Multiple R-squared: 0.1962, Adjusted R-squared: 0.1664
## F-statistic: 6.59 on 1 and 27 DF, p-value: 0.01611
## Monocytes
fit <- lm(cbc_test$mono_percent ~ y.test_cbc)
plot(fit)




shapiro.test(fit$residuals) # normal
##
## Shapiro-Wilk normality test
##
## data: fit$residuals
## W = 0.96399, p-value = 0.4103
qqnorm(fit$residuals); qqline(fit$residuals, col = 2)

##
## Call:
## lm(formula = cbc_test$mono_percent ~ y.test_cbc)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.033112 -0.007312 0.000888 0.011888 0.025022
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.074978 0.005136 14.598 2.48e-14 ***
## y.test_cbcDR 0.003134 0.006185 0.507 0.616
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.01541 on 27 degrees of freedom
## Multiple R-squared: 0.009421, Adjusted R-squared: -0.02727
## F-statistic: 0.2568 on 1 and 27 DF, p-value: 0.6164
## Eosinophils
fit <- lm(cbc_test$eos_percent ~ y.test_cbc)
plot(fit)




shapiro.test(fit$residuals) # NOT normal!
##
## Shapiro-Wilk normality test
##
## data: fit$residuals
## W = 0.82529, p-value = 0.0002458
qqnorm(fit$residuals); qqline(fit$residuals, col = 2)

wilcox.test(x = cbc_test$eos_percent[y.test_cbc == "ER"],
y = cbc_test$eos_percent[y.test_cbc == "DR"]) # pw=0.70
##
## Wilcoxon rank sum test with continuity correction
##
## data: cbc_test$eos_percent[y.test_cbc == "ER"] and cbc_test$eos_percent[y.test_cbc == "DR"]
## W = 98.5, p-value = 0.7041
## alternative hypothesis: true location shift is not equal to 0
## Basophils
fit <- lm(cbc_test$baso_percent ~ y.test_cbc)
plot(fit)




shapiro.test(fit$residuals) # NOT normal!
##
## Shapiro-Wilk normality test
##
## data: fit$residuals
## W = 0.67347, p-value = 8.952e-07
qqnorm(fit$residuals); qqline(fit$residuals, col = 2)

wilcox.test(x = cbc_test$baso_percent[y.test_cbc == "ER"],
y = cbc_test$baso_percent[y.test_cbc == "DR"]) # pw=0.17
##
## Wilcoxon rank sum test with continuity correction
##
## data: cbc_test$baso_percent[y.test_cbc == "ER"] and cbc_test$baso_percent[y.test_cbc == "DR"]
## W = 60.5, p-value = 0.1683
## alternative hypothesis: true location shift is not equal to 0
# variables that are normally distributed
cbc_test %>% dplyr::select(Leukocyte_Counts.x10.9., Neu_percent, lym_percent, mono_percent) %>%
mutate(calculated_Response = as.character(y.test_cbc)) %>%
gather(Cells, Value, -calculated_Response) %>%
group_by(Cells, calculated_Response) %>%
summarise(Mean = round(mean(Value, na.rm = TRUE), 2), SD = round(sd(Value, na.rm = TRUE), 2))
## # A tibble: 8 x 4
## # Groups: Cells [?]
## Cells calculated_Response Mean SD
## <chr> <chr> <dbl> <dbl>
## 1 Leukocyte_Counts.x10.9. DR 6.12 1.52
## 2 Leukocyte_Counts.x10.9. ER 5.13 1.25
## 3 lym_percent DR 0.37 0.07
## 4 lym_percent ER 0.290 0.08
## 5 mono_percent DR 0.08 0.01
## 6 mono_percent ER 0.07 0.02
## 7 Neu_percent DR 0.51 0.08
## 8 Neu_percent ER 0.580 0.1
# variables that are NOT normally distributed
cbc_test %>% dplyr::select(eos_percent, baso_percent) %>%
mutate(calculated_Response = as.character(y.test_cbc)) %>%
gather(Cells, Value, -calculated_Response) %>%
group_by(Cells, calculated_Response) %>%
dplyr::summarise(Median = round(median(Value, na.rm = TRUE), 2),
Q1 = quantile(Value, 0.25, na.rm = TRUE), Q3 = quantile(Value, 0.75, na.rm = TRUE))
## # A tibble: 4 x 5
## # Groups: Cells [?]
## Cells calculated_Response Median Q1 Q3
## <chr> <chr> <dbl> <dbl> <dbl>
## 1 baso_percent DR 0.01 0.005 0.01
## 2 baso_percent ER 0 0.0021 0.006
## 3 eos_percent DR 0.04 0.02 0.0449
## 4 eos_percent ER 0.03 0.024 0.059
Plot AUROC
aucDat <- rbind(do.call(rbind, pos_hk_elasticNet), do.call(rbind, pos_hk_rf))
aucDat$Panel[aucDat$Panel == "ucscGenes"] <- "UCSC genes"
aucDat$Panel[aucDat$Panel == "ucscGeneIso"] <- "UCSC gene-isoforms"
aucDat$Panel[aucDat$Panel == "ensembl"] <- "Ensembl"
aucDat$Panel[aucDat$Panel == "trinity"] <- "Trinity"
aucDat$Panel[aucDat$Panel == "all"] <- "Combined"
aucDat$Panel[aucDat$Panel == "clinical"] <- "Clinical"
aucDat$Panel[aucDat$Panel == "ucscGenes_clinical"] <- "UCSC genes+clinical"
aucDat$Panel[aucDat$Panel == "ucscGeneIso_clinical"] <- "UCSC gene-isoforms+clinical"
aucDat$Panel[aucDat$Panel == "ensembl_clinical"] <- "Ensembl+clinical"
aucDat$Panel[aucDat$Panel == "trinity_clinical"] <- "Trinity+clinical"
aucDat$Panel[aucDat$Panel == "all_clinical"] <- "Combined+clinical"
aucDat$Panel <- factor(aucDat$Panel,
levels = c("Clinical", "UCSC genes", "UCSC gene-isoforms", "Ensembl", "Trinity", "Combined",
"UCSC genes+clinical", "UCSC gene-isoforms+clinical", "Ensembl+clinical", "Trinity+clinical", "Combined+clinical"))
aucDat$PanelType <- unlist(lapply(strsplit(as.character(aucDat$Panel), "\\+"), function(i) i[1]))
aucDat$Cohort <- "nanoString - Validation"
aucDat$ClassifierType <- rep("Molecular", nrow(aucDat))
aucDat$ClassifierType[grep("+clinical", aucDat$Panel)] <- "Molecular+Clinical"
aucDat$ClassifierType[grep("Clinical", aucDat$Panel)] <- "Clinical"
aucDat$PanelType <- factor(aucDat$PanelType, levels = c("Clinical", "UCSC genes", "UCSC gene-isoforms", "Ensembl", "Trinity", "Combined"))
## AUC records
aucLabel <- aucDat %>% dplyr::select(Panel, Classifier, PanelType, ClassifierType, AUC, CI, Cohort) %>%
group_by(Panel, Classifier, Cohort) %>% slice(1)
ci <- as.data.frame(do.call(rbind, strsplit(gsub("95% CI:", "", aucLabel$CI), "-")))
aucLabel$min <- as.numeric(as.character(ci$V1))
aucLabel$max <- as.numeric(as.character(ci$V2))
aucLabel$auc <- as.numeric(unlist(lapply(strsplit(aucLabel$AUC, "= "), function(i) i[2])))
aucLabel$ClassifierType <- rep("Molecular", nrow(aucLabel))
aucLabel$ClassifierType[grep("+clinical", aucLabel$Panel)] <- "Molecular+Clinical"
aucLabel$ClassifierType[grep("Clinical", aucLabel$Panel)] <- "Clinical"
aucLabel$x <- 0.75
aucLabel$y <- rep(0.1, nrow(aucLabel))
aucLabel$y[aucLabel$ClassifierType == "Molecular+Clinical"] <- 0.25
aucLabel$y.ci <- rep(0.05, nrow(aucLabel))
aucLabel$y.ci[aucLabel$ClassifierType == "Molecular+Clinical"] <- 0.2
colPalette <- c("dodgerblue", "black")
pdf(paste0(rnaelements_dir, "/results/Figures/Figure4/Figure4b_revised.pdf"), width = 15, height = 6.5)
ggplot(aucDat, aes(x = FPR, y = TPR, color = ClassifierType)) +
geom_rect(data = subset(aucDat, PanelType == "Trinity"),
aes(fill = PanelType),xmin = -Inf,xmax = Inf,
ymin = -Inf,ymax = Inf, alpha = 0.01, color = "#FC8D62") +
geom_abline(slope = 1, intercept = 0, linetype = "dashed", col = "black") +
geom_point() + geom_line() +
facet_grid(Classifier ~ PanelType) +
customTheme(sizeStripFont = 12, xAngle = 0, hjust = 0.5, vjust = 0.5,
xSize = 10, ySize = 10, xAxisSize = 10, yAxisSize = 10) +
#scale_color_manual(values=colPalette, name = "Panel") +
geom_text(data = aucLabel, aes(x = x, y = y, label = AUC), size = 3) +
geom_text(data = aucLabel, aes(x = x, y = y.ci, label = CI), size = 3) +
xlab("False Positive Rate") + ylab("True Positive Rate")
dev.off()
## png
## 2
apply Trinity panel to all subjects of the discovery and validation cohort
colMeans(pos_hk_demo1_disList$trinity[y.train$trinity == "DR", ])-colMeans(pos_hk_demo1_disList$trinity[y.train$trinity == "ER", ])
## CASP8 CECR1
## -0.24248974 0.03176156
## FNIP1 FPR2
## -0.33047723 -0.17945569
## LYST QKI
## -0.10204956 -0.15426356
## SETX SF3B1
## -0.29872154 -0.34005866
## unknown1_comp54405_c1_seq1 unknown2_comp55647_c0_seq2
## -0.43329914 -0.09372008
## unknown2a_comp55647_c0_seq2 unknown3_comp56590_c0_seq8
## -0.21498836 -0.14155401
## FPR1_intron_comp17070_c0_seq1 IFRD1_intron_comp41141_c0_seq1
## -0.19950686 -0.18606152
set.seed(121)
## Elastic net
fit <- enet(X = pos_hk_demo1_disList$trinity, Y = y.train$trinity, alpha = pos_hk_elasticNet_lambda["trinity"], lambda = NULL,
family = "binomial", X.test = pos_hk_demo1_valList$trinity, Y.test = y.test$trinity,
filter = "none", topranked = 10, keepVar = NULL, pop.prev = 0.6, cutoff = NULL)
## using a cut-off of 0.5
pred <- rep(NA, length(fit$probs))
pred[fit$probs >= 0.5] <- "DR"
pred[fit$probs < 0.5] <- "ER"
table(pred=factor(pred, levels(y.test$trinity)), truth=y.test$trinity)
## truth
## pred ER DR
## ER 4 5
## DR 5 19
calcPerf(pred=factor(pred, levels(y.test$trinity)), truth=y.test$trinity, prev = 0.6)
## sens spec npv ppv accuracy
## 0.7916667 0.4444444 0.5871560 0.7276596 0.6969697
# tune cut-off
#fit$perfTest
## random forest
fit <- rforest(X = pos_hk_demo1_disList$trinity, Y = y.train$trinity,
family = "binomial", pos_hk_demo1_valList$trinity, Y.test = y.test$trinity,
filter = "none", topranked = 10)
## using a cut-off of 0.5
pred <- rep(NA, length(fit$probs))
pred[fit$probs >= 0.5] <- "DR"
pred[fit$probs < 0.5] <- "ER"
table(pred=factor(pred, levels(y.test$trinity)), truth=y.test$trinity)
## truth
## pred ER DR
## ER 2 3
## DR 7 21
calcPerf(pred=factor(pred, levels(y.test$trinity)), truth=y.test$trinity, prev = 0.6)
## sens spec npv ppv accuracy
## 0.8750000 0.2222222 0.5423729 0.8552036 0.6969697
# tune cut-off
#fit$perfTest